XIncludeFilter xref

View Javadoc

1   /*--
2    Copyright 2003 Jan Pavlovic and Tomas Pitner.
3    Masaryk University in Brno, Czech Republic
4    All rights reserved.
5    This code is based on the work copyrighted by
6    Copyright 2001-2003 Elliotte Rusty Harold.
7    All rights reserved.
8    This file is part of XIncluder, a Java class library for integrating XInclude
9    processing with SAX, DOM, and JDOM.
10
11   XIncluder is free software; you can redistribute it and/or modify
12   it under the terms of the GNU Lesser General Public License version 2.1
13   as published by the Free Software Foundation.
14   XIncluder is distributed in the hope that it will be useful,
15   but WITHOUT ANY WARRANTY; without even the implied warranty of
16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17   GNU Lesser General Public License for more details.
18   You should have received a copy of the GNU Lesser General Public License
19   along with XIncluder; if not, write to the Free Software
20   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21   THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
22   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24   DISCLAIMED.  IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
25   OTHER CONTRIBUTORS TO THIS PACKAGE
26   BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
29   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
30   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33   SUCH DAMAGE.
34   */
35  package net.sf.tomp.xml.include;
36  
37  import net.sf.tomp.xtcl.filter.XTFilterImpl;
38  
39  import org.apache.commons.logging.Log;
40  import org.apache.commons.logging.LogFactory;
41  import org.xml.sax.Attributes;
42  import org.xml.sax.EntityResolver;
43  import org.xml.sax.Locator;
44  import org.xml.sax.SAXException;
45  import org.xml.sax.XMLReader;
46  import org.xml.sax.helpers.AttributesImpl;
47  import org.xml.sax.helpers.NamespaceSupport;
48  import org.xml.sax.helpers.XMLReaderFactory;
49  
50  import java.io.BufferedInputStream;
51  import java.io.BufferedReader;
52  import java.io.IOException;
53  import java.io.InputStream;
54  import java.io.InputStreamReader;
55  import java.io.UnsupportedEncodingException;
56  
57  import java.net.MalformedURLException;
58  import java.net.URL;
59  import java.net.URLConnection;
60  
61  import java.util.Stack;
62  import java.util.regex.Matcher;
63  import java.util.regex.Pattern;
64  
65  /***
66   * Copyright 2003 Jan Pavlovic and Tomas Pitner. Masaryk University in Brno,
67   * Czech Republic All rights reserved. This code is based on the work
68   * copyrighted by Copyright 2001-2003 Elliotte Rusty Harold. All rights
69   * reserved.
70   * <p>
71   * This is a SAX filter which resolves all XInclude include elements before
72   * passing them on to the client application. Currently this class has the
73   * following known deviation from the XInclude specification:
74   * </p>
75   * <ol>
76   * <li>XPointer is not supported.</li>
77   * </ol>
78   * <p>
79   * Extensions made by JP and TP: <br/ > the URL of the included TEXT document
80   * (i.e. if <code>parse='text'</code>) can be in one of the following forms
81   * (lines are numbered starting at 1):
82   * <ul>
83   * <li><code>textfileurl#startLineNumber</code> will include only the line
84   * with startLineNumber <br/></li>
85   * <li><code>textfileurl#startLineNumber$count</code> will include count
86   * lines beginning from startLineNumber <br/></li>
87   * <li><code>textfileurl#startLineNumber-endLineNumber</code> will include
88   * all lines beginning from startLineNumber and ending to (including) line
89   * endLineNumber <br/></li>
90   * <li><code>textfileurl#startLineNumber-</code> will include all lines
91   * beginning from startLineNumber to the end of file <br/></li>
92   * <li><code>textfileurl#$countOfLines</code> or
93   * <code>textfileurl#-countOfLines</code> will include first countOfLines
94   * lines from the file <br/></li>
95   * </ul>
96   * TO-DO: check whether the xml:base-s really work for file: URLs
97   * <br/>
98   * </p>
99   * <p>
100  * Extensions made by TP, Nov 2003: <br/> Instead of specifying the start/end
101  * line and/or line count numerically, Java-style regexp patterns can be put in
102  * slashes in place of the first or last line number.
103  * <ul>
104  * <li><code>textfileurl#/startLineRegexp/</code> will include only the line
105  * with startLineRegexp <br/></li>
106  * <li><code>textfileurl#/startLineRegexp/$count</code> will include count
107  * lines beginning from start line <br/></li>
108  * <li><code>textfileurl#/startLineRegexp/-/endLineRegexp/</code> will
109  * include all lines beginning from start line and ending to line (including)
110  * matching the end pattern <br/></li>
111  * <li><code>textfileurl#/startLineRegexp/-endLineNo</code> will include all
112  * lines beginning from start line and ending to line (including) specified by
113  * <code>endLineNo</code> <br/></li>
114  * <li><code>textfileurl#startLineNo-/endLineRegexp/</code> will include all
115  * lines beginning from the start line number and ending to line (including)
116  * matching the end pattern <br/></li>
117  * <li><code>textfileurl#-/endLineRegexp/</code> will include all lines upto
118  * the line (including) matching the end pattern <br/></li>
119  * <li><code>textfileurl#/startLineRegexp/-</code> will include all lines
120  * beginning from the start line to the end of file <br/></li>
121  * </ul>
122  * </p>
123  * <p>
124  * Furthermore, I would definitely use a new instance of this class for each
125  * document you want to process. I doubt it can be used successfully on multiple
126  * documents. Furthermore, I can virtually guarantee that this class is not
127  * thread safe. You have been warned.
128  * </p>
129  * <p>
130  * Since this class is not designed to be subclassed, and since I have not yet
131  * considered how that might affect the methods herein or what other protected
132  * methods might be needed to support subclasses, I have declared this class
133  * final. I may remove this restriction later, though the use-case for
134  * subclassing is weak. This class is designed to have its functionality
135  * extended via a horizontal chain of filters, not a vertical hierarchy of sub
136  * and superclasses.
137  * </p>
138  * <p>
139  * To use this class:
140  * </p>
141  * <ol>
142  * <li>Construct an <code>XIncludeFilter</code> object with a known base URL
143  * </li>
144  * <li>Pass the <code>XMLReader</code> object from which the raw document
145  * will be read to the <code>setParent()</code> method of this object.</li>
146  * <li>Pass your own <code>ContentHandler</code> object to the
147  * <code>setContentHandler()</code> method of this object. This is the object
148  * which will receive events from the parsed and included document.</li>
149  * <li>Optional: if you wish to receive comments, set your own
150  * <code>LexicalHandler</code> object as the value of this object's
151  * http://xml.org/sax/properties/lexical-handler property. Also make sure your
152  * <code>LexicalHandler</code> asks this object for the status of each comment
153  * using <code>insideIncludeElement</code> before doing anything with the
154  * comment.</li>
155  * <li>Pass the URL of the document to read to this object's
156  * <code>parse()</code> method</li>
157  * </ol>
158  * <p>
159  * e.g.
160  * </p>
161  * 
162  * <pre><code>
163  * XIncludeFilter includer = new XIncludeFilter(base);
164  * includer.setParent(parser);
165  * includer.setContentHandler(new SAXXIncluder(System.out));
166  * includer.parse(args[i]);
167  * </code>
168  </pre>
169  * 
170  * @author Elliotte Rusty Harold, Tomas Pitner, Jan Pavlovic
171  * @version 1.0, April 7, 2004
172  */
173 public class XIncludeFilter extends XTFilterImpl {
    /**
     * The delimiter characters that may enclose a regular expression used to
     * specify the first or last line of a text inclusion.
     */
    protected static final String REGEXP_DELIMITERS = "/";

    /**
     * The number (or pattern) of the first line to be included is specified in
     * the URL after the <code>term1</code> String.
     */
    protected static final String term1 = "#";

    /**
     * The number of lines to be included is specified in the URL after the
     * <code>term2</code> String.
     */
    protected static final String term2 = "$";

    /**
     * The number (or pattern) of the last line to be included is specified in
     * the URL after the <code>term3</code> String.
     */
    protected static final String term3 = "-";

    /** Namespace URI that identifies XInclude elements. */
    private static final String XINCLUDE_NAMESPACE = "http://www.w3.org/2001/XInclude";

    // NOTE(review): the logger is created for AdaptiveXIncludeFilter, not for
    // this class - confirm that sharing the log category is intended.
    private Log log = LogFactory.getLog(AdaptiveXIncludeFilter.class);

    // Stack of base URLs (java.net.URL); the top entry is the base in effect
    // for the element currently being processed.
    private Stack bases = new Stack();

    // Stack of the Locator objects handed to setDocumentLocator(); entries may
    // be null because the locator is pushed before it is null-checked.
    private Stack locators = new Stack();

    // necessary to throw away contents of non-empty XInclude elements:
    // non-zero while the filter is inside an xi:include element
    private int level = 0;

    // incremented by startDocument() and decremented by endDocument(); the
    // document events are forwarded downstream only at depth 0
    private int depth = 0;

    // when true, startElement() adds an xml:base attribute to the next
    // forwarded element and resets the flag
    private boolean atRoot = false;
212 
213     public String getXIncludeNamespace() {
214         return XINCLUDE_NAMESPACE;
215     }
216 
217     /*
218      * private EntityResolver resolver; public XIncludeFilter() { this(null); }
219      * public XIncludeFilter(EntityResolver resolver) { this.resolver =
220      * resolver; }
221      */
222 
223     // what if this isn't called????
224     // do I need to check this in startDocument() and push something
225     // there????
226     public void setDocumentLocator(Locator locator) {
227         //System.out.println("+++ADAPTIVEXINCLUDER.setDocumentLocator("+locator);
228         //System.out.println("+++ADAPTIVEXINCLUDER.documentLocator.getSystemId="+locator.getSystemId());
229         //if(locator == null || locator.getSystemId() == null) return;
230         super.setDocumentLocator(locator);
231 
232         locators.push(locator);
233 
234         if (locator == null) {
235             return;
236         }
237 
238         String base = locator.getSystemId();
239 
240         try {
241             // for some buggy :-) xmlreaders (or emmiters),
242             // locator.getSystemId() is null,
243             // and causes problems with bases.push(new URL(null))!!!
244             if (base != null) {
245                 bases.push(new URL(base));
246 
247                 //System.err.println(">>> setDocumentLocator:pushing
248                 // base="+base+" atRoot="+atRoot+" level="+level);
249             } else {
250                 // FIXME: workaround for null locator.getSystemId() - some base
251                 // must be pushed...
252                 //if(bases.size() == 0)
253                 //bases.push(new URL("file:my-url.txt"));
254                 log.debug(">>> " + this
255                         + "setDocumentLocator: trying to push base null in "
256                         + locator + ", sysId=" + locator.getSystemId());
257 
258                 //System.out.println(">>> setDocumentLocator:pushing base="+new
259                 // URL("file:my-url.txt")+" bases.size()="+bases.size());
260                 //else
261                 //   bases.push(bases.peek());
262             }
263         } catch (MalformedURLException e) {
264             e.printStackTrace();
265             throw new UnsupportedOperationException("Unrecognized SYSTEM ID: "
266                     + base);
267         }
268     }
269 
270     /***
271      * <p>
272      * This utility method returns true if and only if this reader is currently
273      * inside a non-empty include element. (This is <strong>not </strong> the
274      * same as being inside the node set which replaces the include element.)
275      * This is primarily needed for comments inside include elements. It must be
276      * checked by the actual LexicalHandler to see whether a comment is passed
277      * or not.
278      * </p>
279      * 
280      * @return boolean
281      */
282     public boolean insideIncludeElement() {
283         return level != 0;
284     }
285 
286     public void startElement(String uri, String localName, String qName,
287             Attributes atts) throws SAXException {
288         //System.out.println("ADAPTIVEXINCLUDER.startElement,
289         // localname="+localName+" uri="+uri);
290         if (level == 0) { // We're not inside an xi:include element
291 
292             String base = atts.getValue(NamespaceSupport.XMLNS, "base");
293             URL parentBase = (URL) bases.peek();
294             URL currentBase = parentBase;
295 
296             if (base != null) {
297                 try {
298                     currentBase = new URL(parentBase, base);
299                 } catch (MalformedURLException e) {
300                     throw new SAXException(
301                             "Malformed base URL: " + currentBase, e);
302                 }
303             }
304 
305             bases.push(currentBase); //System.out.println(">>> startElement
306 
307             if (uri.equals(getXIncludeNamespace())
308                     && localName.equals("include")) {
309                 // include external document
310                 String href = atts.getValue("href");
311 
312                 // Verify that there is an href attribute
313                 if (href == null) {
314                     throw new SAXException("Missing href attribute");
315                 }
316 
317                 String parse = atts.getValue("parse");
318 
319                 if (parse == null) {
320                     parse = "xml";
321                 }
322 
323                 if (parse.equals("text")) {
324                     String encoding = atts.getValue("encoding");
325 
326                     includeTextDocument(href, encoding);
327                 } else if (parse.equals("xml")) {
328                     String variant = atts.getValue("variant");
329 
330                     includeXMLDocument(href, variant);
331                 }
332 
333                 // Need to check this also in DOM and JDOM????
334                 else {
335                     throw new SAXException(
336                             "Illegal value for parse attribute: " + parse);
337                 }
338 
339                 level++;
340             } else {
341                 if (atRoot) {
342                     // add xml:base attribute if necessary
343                     AttributesImpl attsImpl = new AttributesImpl(atts);
344 
345                     attsImpl.addAttribute(NamespaceSupport.XMLNS, "base",
346                             "xml:base", "CDATA", currentBase.toExternalForm());
347                     atts = attsImpl;
348                     atRoot = false;
349                 }
350 
351                 super.startElement(uri, localName, qName, atts);
352             }
353         }
354     }
355 
356     public void endElement(String uri, String localName, String qName)
357             throws SAXException {
358         if (uri.equals(getXIncludeNamespace()) && localName.equals("include")) {
359             level--;
360         } else if (level == 0) {
361             bases.pop();
362             super.endElement(uri, localName, qName);
363         }
364 
365         //System.out.println("+++ADAPTIVEXINCLUDER.endElement,
366         // localname="+localName);
367     }
368 
369     public void startDocument() throws SAXException {
370         //System.out.println("+++ADAPTIVEXINCLUDER.startDocument");
371         level = 0;
372 
373         if (depth == 0) {
374             super.startDocument();
375         }
376 
377         depth++;
378     }
379 
380     public void endDocument() throws SAXException {
381         depth--;
382 
383         if (depth == 0) {
384             //System.out.println("+++trying to do
385             // ADAPTIVEXINCLUDER.endDocument");
386             try {
387                 //System.out.println("++++depth="+depth+"++++
388                 // "+this+".endDocument: locators#"+locators.size()+"
389                 // .pop()="+locators);
390                 locators.pop();
391             } catch (java.util.EmptyStackException ese) {
392                 log
393                         .error(
394                                 "+++ADAPTIVEXINCLUDER.endDocument caused Empty stack in locators",
395                                 ese);
396             }
397 
398             try {
399                 Object p = bases.pop(); // pop the URL for the document itself
400             } catch (java.util.EmptyStackException ese) {
401                 throw ese;
402 
403                 //System.out.println("+++ADAPTIVEXINCLUDER.endDocument caused
404                 // Empty stack in bases");
405             }
406 
407             //System.out.println("+++alive in ADAPTIVEXINCLUDER.endDocument");
408             super.endDocument();
409         }
410 
411         //System.out.println("+++ADAPTIVEXINCLUDER.endDocument");
412     }
413 
414     // how do prefix mappings move across documents????
415     public void startPrefixMapping(String prefix, String uri)
416             throws SAXException {
417         if (level == 0) {
418             super.startPrefixMapping(prefix, uri);
419         }
420     }
421 
422     public void endPrefixMapping(String prefix) throws SAXException {
423         if (level == 0) {
424             super.endPrefixMapping(prefix);
425         }
426     }
427 
428     public void characters(char[] ch, int start, int length)
429             throws SAXException {
430         // System.out.println("characters: "+new String(ch, start, length));
431         if (level == 0) {
432             super.characters(ch, start, length);
433         }
434 
435         //System.out.println("+++ADAPTIVEXINCLUDER.characters="+new String(ch,
436         // start, length));
437     }
438 
439     public void ignorableWhitespace(char[] ch, int start, int length)
440             throws SAXException {
441         if (level == 0) {
442             super.ignorableWhitespace(ch, start, length);
443         }
444     }
445 
446     public void processingInstruction(String target, String data)
447             throws SAXException {
448         if (level == 0) {
449             super.processingInstruction(target, data);
450         }
451     }
452 
453     public void skippedEntity(String name) throws SAXException {
454         if (level == 0) {
455             super.skippedEntity(name);
456         }
457     }
458 
459     // convenience method for error messages
460     private String getLocation() {
461         String locationString = "";
462         Locator locator = (Locator) locators.peek();
463         String publicID = "";
464         String systemID = "";
465         int column = -1;
466         int line = -1;
467 
468         if (locator != null) {
469             publicID = locator.getPublicId();
470             systemID = locator.getSystemId();
471             line = locator.getLineNumber();
472             column = locator.getColumnNumber();
473         }
474 
475         locationString = " in document included from " + publicID + " at "
476                 + systemID + " at line " + line + ", column " + column;
477 
478         return locationString;
479     }
480 
481     /***
482      * <p>
483      * This method reads URL and returns the regular expression to match on the
484      * first line.
485      * </p>
486      * 
487      * @param url URL of the document that will be read
488      * @return the regular expression to match on the first line.
489      */
490     protected static String getFirstLineRegexp(String url) {
491         int indexTerm1 = url.indexOf(term1) + 1; // position after #
492 
493         if (indexTerm1 == 0) { // if no # return 0
494 
495             return null;
496         }
497 
498         int separatorIndex = url.indexOf(term2); // from # until $
499 
500         if (separatorIndex < 0) {
501             separatorIndex = url.indexOf(term3); // from # until -
502         }
503 
504         if (separatorIndex < 0) {
505             separatorIndex = url.length(); // take whole URL
506         }
507 
508         if (indexTerm1 == separatorIndex) { // if #$ or #- return ""
509 
510             return null;
511         }
512 
513         // trim also the leading slashes
514         // http://something.com/somepage#/firstlineregexp/-/lastlineregexp/
515         // must give "firstlineregexp"
516         char begin = url.charAt(indexTerm1);
517 
518         if (REGEXP_DELIMITERS.indexOf(begin) >= 0) {
519             return url.substring(indexTerm1 + 1, separatorIndex - 1);
520         } else {
521             return null;
522         }
523     }
524 
525     /***
526      * <p>
527      * This method reads URL and returns the regular expression to match on the
528      * last line.
529      * </p>
530      * 
531      * @param url URL of the document that will be read
532      * @return the regular expression to match on the last line.
533      */
534     protected static String getLastLineRegexp(String url) {
535         // if no $ - return null
536         if (url.indexOf(term1) == -1) {
537             //System.err.println("url.indexOf(term1) == -1: no $ -");
538             return null;
539         }
540 
541         int indexTerm2 = url.indexOf(term2) + 1;
542         int indexTerm3 = url.indexOf(term3) + 1;
543 
544         //System.err.println("line-count: indexTerm2="+indexTerm2+",
545         // indexTerm3="+indexTerm3+", url.length="+url.length());
546         // if no $ neither - then include one line:
547         if ((indexTerm2 == 0) && (indexTerm3 == 0)) {
548             //System.err.println("indexTerm2 == 0 && indexTerm3 == 0 because no
549             // $ neither - ");
550             return null;
551         }
552 
553         // if - at the end of the url then include all lines to the end:
554         if (indexTerm3 == url.length()) {
555             //System.err.println("indexTerm3 == url.length() - at the end");
556             return null;
557         }
558 
559         String count = url.substring(indexTerm2).trim(); // from $ to end
560         String last = url.substring(indexTerm3).trim(); // from - to end
561 
562         // if '$' (count of lines) specified - no regexp returned!
563         if (indexTerm2 > 0) {
564             // System.err.println("line-count: parsing count='"+count+"'");
565             //System.err.println("indexTerm2 > 0 because $ present");
566             return null;
567 
568             // if '-' (last line number) specified
569             // - return the string last with leading/trailing slashes trimmed
570         } else {
571             // System.err.println("line-count: parsing last='"+last+"'");
572             char begin = last.charAt(last.length() - 1);
573 
574             if (REGEXP_DELIMITERS.indexOf(begin) >= 0) {
575                 return last.substring(1, last.length() - 1);
576             } else {
577                 //System.err.println("REGEXP_DELIMITERS.indexOf(begin) < 0 - no
578                 // regexp delimiter found ");
579                 return null;
580             }
581         }
582     }
583 
584     /***
585      * <p>
586      * This method reads URL and return the first line to read
587      * </p>
588      * 
589      * @param url URL of the document that will be read
590      * @return int
591      */
592     protected int getLineBegin(String url) {
593         int indexTerm1 = url.indexOf(term1) + 1; // position after #
594 
595         if (indexTerm1 == 0) { // if no # return 0
596 
597             return 0;
598         }
599 
600         int separatorIndex = url.indexOf(term2); // from # until $
601 
602         if (separatorIndex < 0) {
603             separatorIndex = url.indexOf(term3); // from # until -
604         }
605 
606         if (separatorIndex < 0) {
607             separatorIndex = url.length(); // take whole URL
608         }
609 
610         if (indexTerm1 == separatorIndex) { // if #$ or #- return 1
611 
612             return 1;
613         }
614 
615         String begin = url.substring(indexTerm1, separatorIndex).trim();
616 
617         try {
618             // System.err.println("line-begin: parsing begin='"+begin+"'");
619             return Integer.parseInt(begin);
620         } catch (NumberFormatException nfe) {
621             return 0;
622         }
623     }
624 
625     /***
626      * <p>
627      * This method reads URL and return the amount of line line to read
628      * </p>
629      * 
630      * @param url URL of the document that will be read
631      * @return int
632      */
633     protected int getLineCount(String url) {
634         if (url.indexOf(term1) == -1) {
635             return Integer.MAX_VALUE;
636         }
637 
638         int indexTerm2 = url.indexOf(term2) + 1;
639         int indexTerm3 = url.indexOf(term3) + 1;
640 
641         //System.err.println("line-count: indexTerm2="+indexTerm2+",
642         // indexTerm3="+indexTerm3+", url.length="+url.length());
643         // if no $ neither - then include one line:
644         if ((indexTerm2 == 0) && (indexTerm3 == 0)) {
645             return 1;
646         }
647 
648         // if - at the end of the url then include all lines to the end:
649         if (indexTerm3 == url.length()) {
650             return Integer.MAX_VALUE;
651         }
652 
653         String count = url.substring(indexTerm2).trim(); // from $ to end
654         String last = url.substring(indexTerm3).trim(); // from - to end
655 
656         try {
657             // if '$' (count of lines) specified
658             if (indexTerm2 > 0) {
659                 // System.err.println("line-count: parsing count='"+count+"'");
660                 int iCount = Integer.parseInt(count);
661 
662                 return iCount;
663 
664                 // if '-' (last line number) specified
665             } else {
666                 // System.err.println("line-count: parsing last='"+last+"'");
667                 int iLast = Integer.parseInt(last);
668 
669                 //System.err.println("line-count returning iLast + 1 -
670                 // getLineBegin(url)=" +(iLast + 1 - getLineBegin(url)));
671                 return (iLast + 1) - getLineBegin(url);
672             }
673         } catch (NumberFormatException nfe) {
674             return 0;
675         }
676     }
677 
678     protected boolean isLastLineNo(String url) {
679         if (url.indexOf(term1) == -1) {
680             return false;
681         }
682 
683         int indexTerm2 = url.indexOf(term2) + 1; // $
684         int indexTerm3 = url.indexOf(term3) + 1; // -
685 
686         if ((indexTerm2 == 0) && (indexTerm3 == 0)) {
687             return false;
688         }
689 
690         // if - at the end of the url then include all lines to the end:
691         if (indexTerm3 == url.length()) {
692             return false;
693         }
694 
695         String count = url.substring(indexTerm2).trim(); // from $ to end
696         String last = url.substring(indexTerm3).trim(); // from - to end
697 
698         try {
699             // if '$' (count of lines) specified
700             if (indexTerm2 > 0) {
701                 return false;
702 
703                 // if '-' (last line number) specified
704             } else {
705                 // System.err.println("line-count: parsing last='"+last+"'");
706                 int iLast = Integer.parseInt(last);
707 
708                 return true;
709             }
710         } catch (NumberFormatException nfe) {
711             return false;
712         }
713     }
714 
715     /***
716      * <p>
717      * This utility method reads a document at a specified URL and fires off
718      * calls to <code>characters()</code>. It's used to include files with
719      * <code>parse="text"</code>
720      * </p>
721      * 
722      * @param url URL of the document that will be read
723      * @param encoding Encoding of the document; e.g. UTF-8, ISO-8859-1, etc.
724      * @throws SAXException if the requested document cannot be downloaded from
725      *             the specified URL or if the encoding is not recognized
726      */
727     private void includeTextDocument(String url, String encoding)
728             throws SAXException {
729         String firstLineRegexp = getFirstLineRegexp(url);
730         String lastLineRegexp = getLastLineRegexp(url);
731 
732         int begin = getLineBegin(url);
733         int count = getLineCount(url);
734 
735         boolean isLastLineNo = isLastLineNo(url);
736 
737         // System.err.println("include-text: begin="+begin+" count="+count);
738         // if count=0, i.e. include no lines, then return immediately
739         if (((count == 0) && (lastLineRegexp == null))
740                 || (isLastLineNo && (count == 1))) {
741             //System.err.println("count == 0");
742             return;
743         }
744 
745         if ((encoding == null) || encoding.trim().equals("")) {
746             encoding = "UTF-8";
747         }
748 
749         URL source;
750 
751         try {
752             URL base = (URL) bases.peek();
753 
754             source = new URL(base, url);
755         } catch (MalformedURLException e) {
756             UnavailableResourceException ex = new UnavailableResourceException(
757                     "Unresolvable URL " + url + getLocation());
758 
759             ex.setRootCause(e);
760             throw new SAXException("Unresolvable URL " + url + getLocation(),
761                     ex);
762         }
763 
764         try {
765             URLConnection uc = source.openConnection();
766             InputStream in = new BufferedInputStream(uc.getInputStream());
767             String encodingFromHeader = uc.getContentEncoding();
768             String contentType = uc.getContentType();
769 
770             if (encodingFromHeader != null) {
771                 encoding = encodingFromHeader;
772             } else {
773                 // What if file does not have a MIME type but name ends in
774                 // .xml????
775                 // MIME types are case-insensitive
776                 // Java may be picking this up from file URL
777                 if (contentType != null) {
778                     contentType = contentType.toLowerCase();
779 
780                     if (contentType.equals("text/xml")
781                             || contentType.equals("application/xml")
782                             || (contentType.startsWith("text/") && contentType
783                                     .endsWith("+xml"))
784                             || (contentType.startsWith("application/") && contentType
785                                     .endsWith("+xml"))) {
786                         encoding = EncodingHeuristics
787                                 .readEncodingFromStream(in);
788                     }
789                 }
790             }
791 
792             //System.err.println("dealing with begin="+begin);
793             //System.err.println("dealing with count="+count);
794             if ((begin != 0) || (firstLineRegexp != null)) {
795                 BufferedReader br = new BufferedReader(new InputStreamReader(
796                         in, encoding));
797                 String ls = System.getProperty("line.separator");
798 
799                 if (begin != 0) {
800                     // first line number specified
801                     String line = br.readLine();
802 
803                     while (line != null) {
804                         String newLine = br.readLine();
805 
806                         if ((--begin <= 0) && (--count >= 0)) {
807                             if ((count > 0) && (newLine != null)) {
808                                 line = line + ls;
809                             }
810 
811                             this.characters(line.toCharArray(), 0, line
812                                     .length());
813                         }
814 
815                         line = newLine;
816                     }
817                 } else {
818                     // firstLineRegexp != null
819                     //System.err.println("dealing with
820                     // firstLineRegexp="+firstLineRegexp);
821                     //System.err.println("dealing with
822                     // lastLineRegexp="+lastLineRegexp);
823                     boolean wasFirst = false;
824                     boolean beforeLast = true;
825 
826                     Pattern firstLinePattern = (firstLineRegexp == null) ? null
827                             : Pattern.compile(firstLineRegexp);
828                     Pattern lastLinePattern = (lastLineRegexp == null) ? null
829                             : Pattern.compile(lastLineRegexp);
830 
831                     if (firstLinePattern != null) {
832                         begin = Integer.MAX_VALUE;
833                     }
834 
835                     if (lastLinePattern != null) {
836                         count = Integer.MAX_VALUE;
837                     }
838 
839                     String line = br.readLine();
840 
841                     boolean testLastLineNo = (lastLinePattern == null)
842                             && isLastLineNo;
843                     int lastLineNo = count - 1;
844 
845                     while ((line != null) && beforeLast) {
846                         String newLine = br.readLine();
847 
848                         //System.err.println("\nline "+lastLineNo+".="+line);
849                         Matcher firstLineMatcher = (firstLinePattern == null) ? null
850                                 : firstLinePattern.matcher(line);
851                         Matcher lastLineMatcher = (lastLinePattern == null) ? null
852                                 : lastLinePattern.matcher(line);
853 
854                         boolean firstLineMatched = (firstLineMatcher == null) ? false
855                                 : firstLineMatcher.find();
856                         boolean lastLineMatched = (lastLineMatcher == null) ? false
857                                 : lastLineMatcher.find();
858 
859                         //System.err.println("firstLineMatcher.find()="+firstLineMatched);
860                         //System.err.println("lastLineMatcher.find()="+lastLineMatched);
861                         wasFirst |= ((--begin <= 0) || ((firstLineMatcher != null) && firstLineMatched));
862 
863                         /*
864                          * if( --count < 0) { System.err.println("falsing
865                          * beforeLast - count low"); beforeLast = false; }
866                          */
867                         if (wasFirst && beforeLast) {
868                             //if ((count > 0 || (lastLineMatcher != null &&
869                             // !lastLineMatched))
870                             if ((count > 0) && !lastLineMatched
871                                     && (!testLastLineNo || (lastLineNo > 1))
872                                     && (newLine != null)) {
873                                 line = line + ls;
874                             }
875 
876                             this.characters(line.toCharArray(), 0, line
877                                     .length());
878                             count--;
879                         }
880 
881                         if ((count <= 0) || lastLineMatched
882                                 || (testLastLineNo && (lastLineNo <= 1))) {
883                             //System.err.println("falsing beforeLast
884                             // count="+count+" lastLineMatched="
885                             //    +lastLineMatched+"
886                             // testLastLineNo="+testLastLineNo);
887                             beforeLast = false;
888                         }
889 
890                         line = newLine;
891                         lastLineNo--;
892                     }
893                 }
894             } else {
895                 InputStreamReader reader = new InputStreamReader(in, encoding);
896                 char[] c = new char[1024];
897 
898                 while (true) {
899                     int charsRead = reader.read(c, 0, 1024);
900 
901                     if (charsRead == -1) {
902                         break;
903                     }
904 
905                     if (charsRead > 0) {
906                         this.characters(c, 0, charsRead);
907                     }
908                 }
909             }
910         } catch (UnsupportedEncodingException e) {
911             throw new SAXException("Unsupported encoding: " + encoding
912                     + getLocation(), e);
913         } catch (IOException e) {
914             throw new SAXException("Document not found: "
915                     + source.toExternalForm() + getLocation(), e);
916         }
917     }
918 
919     /***
920      * <p>
921      * This utility method reads a document at a specified URL and fires off
922      * calls to various <code>ContentHandler</code> methods. It's used to
923      * include files with <code>parse="xml"</code>
924      * </p>
925      * 
926      * @param url URL of the document that will be read
927      * @param variant DOCUMENT ME!
928      * @throws SAXException if the requested document cannot be downloaded from
929      *             the specified URL.
930      */
931     protected void includeXMLDocument(String url, String variant)
932             throws SAXException {
933         log.info("XI:INCLUDE href=" + url + " in " + this);
934         log.debug("bases=" + bases);
935 
936         URL source;
937 
938         try {
939             URL base = (URL) bases.peek();
940 
941             source = new URL(base, url);
942         } catch (MalformedURLException e) {
943             UnavailableResourceException ex = new UnavailableResourceException(
944                     "Unresolvable URL " + url + getLocation());
945 
946             ex.setRootCause(e);
947             throw new SAXException("Unresolvable URL " + url + getLocation(),
948                     ex);
949         }
950 
951         try {
952             // make this more robust
953             XMLReader parser = getXMLReader(variant);
954 
955             if (parser == null) {
956                 return;
957             }
958 
959             parser.setContentHandler(this);
960 
961             EntityResolver resolver = this.getEntityResolver();
962 
963             if (resolver != null) {
964                 parser.setEntityResolver(resolver);
965             }
966 
967             // also other handlers need to be set!
968             parser.setDTDHandler(this);
969             parser.setEntityResolver(this);
970             parser.setErrorHandler(this);
971 
972             try {
973                 parser.setProperty(
974                         "http://xml.org/sax/properties/lexical-handler", this);
975             } catch (SAXException se) {
976                 se.printStackTrace();
977             }
978 
979             // save old level and base
980             int previousLevel = level;
981 
982             this.level = 0;
983 
984             if (bases.contains(source)) {
985                 Exception e = new CircularIncludeException(
986                         "Circular XInclude Reference to " + source
987                                 + getLocation() + ", bases=" + bases);
988 
989                 // FIXME: does not test circularities!
990                 throw new SAXException("Circular XInclude Reference", e);
991             }
992 
993             bases.push(source); //System.err.println(">>> pushing included to
994 
995             // base="+source);
996             atRoot = true;
997 
998             //System.out.println("%%%%%%%%%%%%% XIncludeFilter will parse
999             // "+source.toExternalForm());
1000             // source.toExternalForm()
1001             //Source src = new StreamSource(source.toString());
1002             //System.out.println("%%%%%%%%%%%%% will parse="+source);
1003             parser.parse(source.toExternalForm());
1004 
1005             // restore old level and base
1006             this.level = previousLevel;
1007 
1008             /* URL popped = (URL) */bases.pop(); //System.err.println("<<<
1009 
1010             // popping base="+popped);
1011         } catch (IOException e) {
1012             throw new SAXException("Document not found: "
1013                     + source.toExternalForm() + getLocation(), e);
1014         }
1015 
1016         //System.out.println("...XI:INCLUDED");
1017     }
1018 
1019     public XMLReader getXMLReader(String variant) throws SAXException {
1020         XMLReader parser = null;
1021 
1022         try {
1023             parser = XMLReaderFactory.createXMLReader();
1024         } catch (SAXException e) {
1025             parser = XMLReaderFactory.createXMLReader(
1026 
1027             //"org.apache.xerces.parsers.SAXParser"
1028                     "org.apache.crimson.parser.XMLReaderImpl");
1029         }
1030 
1031         return parser;
1032     }
1033 
1034     /*
1035      * public static void main(String args[]) throws Exception {
1036      * //System.out.println("firstlineregexp="+getFirstLineRegexp(args[0]));
1037      * //System.out.println("lastlineregexp="+getLastLineRegexp(args[0]));
1038      * XIncludeFilter f = new XIncludeFilter(); f.bases.push(new URL("file:."));
1039      * f.includeTextDocument(args[0], "Windows-1250"); }
1040      */
1041 }